using Dates
using MultivariateStats
using Plots
using NCDatasets
using StatsBase
using Unitful
using DataFrames
using Statistics
Plots.default(; margin=4Plots.mm, size=(700, 400), linewidth=2)Project 1
Bias Correction: Quantile–Quantile Mapping
Module 2
Project
1 Setup
2 Precipitation Data: ERA5 Reanalysis Dataset
# Dataset covers daily precipitation (mm) from 1/1/1979 to 12/31/2020
ds_precip = NCDataset("/Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/precip.nc")
println(ds_precip)Dataset: /Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/precip.nc
Group: /
Dimensions
lon = 10
lat = 10
time = 15341
Variables
lon (10)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: lon
Attributes:
_FillValue = NaN
long_name = Longitude
units = degrees_east
axis = X
standard_name = longitude
actual_range = Float32[0.25, 359.75]
coordinate_defines = center
lat (10)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: lat
Attributes:
_FillValue = NaN
actual_range = Float32[89.75, -89.75]
long_name = Latitude
units = degrees_north
axis = Y
standard_name = latitude
coordinate_defines = center
time (15341)
Datatype: Union{Missing, DateTime} (Float64)
Dimensions: time
Attributes:
_FillValue = NaN
long_name = Time
axis = T
standard_name = time
coordinate_defines = start
actual_range = [692496.0, 701232.0]
delta_t = 0000-00-01 00:00:00
avg_period = 0000-00-01 00:00:00
_ChunkSizes = 1
units = hours since 1900-01-01
calendar = proleptic_gregorian
precip (10 × 10 × 15341)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: lon × lat × time
Attributes:
_FillValue = -9.96921e36
var_desc = Precipitation
level_desc = Surface
statistic = Total
parent_stat = Other
long_name = Daily total of precipitation
cell_methods = time: sum
avg_period = 0000-00-01 00:00:00
actual_range = Float32[0.0, 428.02423]
units = mm
valid_range = Float32[0.0, 1000.0]
dataset = CPC Global Precipitation
_ChunkSizes = Int32[1, 360, 720]
missing_value = -9.96921e36
precip_time = ds_precip["time"][:];
precip_lon = ds_precip["lon"][:];
precip_lat = ds_precip["lat"][:];
precip = ds_precip["precip"][:,:,:];precip = precip .* 1u"mm";precip_lat = reverse(precip_lat)
precip = reverse(precip, dims=2);2.1 Precipitation Heatmap: Past vs Present
h1 = heatmap(
precip_lon,
precip_lat,
precip[:, :, 1]';
xlabel="Longitude",
ylabel="Latitude",
title="Precipitation on 1/1/1979"
)
h2 = heatmap(
precip_lon,
precip_lat,
precip[:, :, end]';
xlabel="Longitude",
ylabel="Latitude",
title="Precipitation on 12/31/2020"
)
plot(h1, h2; layout=(1, 2), size=(900, 400))close(ds_precip)closed Dataset
3 Temperature Data: ERA5 Reanalysis Dataset
# Dataset covers daily temperature (Kelvin) from 1/1/1979 to 12/31/2020
ds_temp = NCDataset("/Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/temp.nc")
println(ds_temp)Dataset: /Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/temp.nc
Group: /
Dimensions
longitude = 21
latitude = 33
time = 15341
Variables
longitude (21)
Datatype: Union{Missing, Float64} (Float64)
Dimensions: longitude
Attributes:
_FillValue = NaN
latitude (33)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: latitude
Attributes:
_FillValue = NaN
units = degrees_north
long_name = latitude
time (15341)
Datatype: DateTime (Int64)
Dimensions: time
Attributes:
units = days since 1979-01-01 00:00:00
calendar = proleptic_gregorian
t2m (21 × 33 × 15341)
Datatype: Union{Missing, Float32} (Float32)
Dimensions: longitude × latitude × time
Attributes:
_FillValue = NaN
units = K
long_name = 2 metre temperature
temp_time = ds_temp["time"][:];
temp_lon = ds_temp["longitude"][:];
temp_lat = ds_temp["latitude"][:];
temp = ds_temp["t2m"][:,:,:];temp_lat = reverse(temp_lat)
temp = reverse(temp, dims=2);3.1 Temperature Heatmap: Past vs Present
h1 = heatmap(
temp_lon,
temp_lat,
temp[:, :, 1]';
xlabel="Longitude",
ylabel="Latitude",
title="Temperature on 1/1/1979"
)
h2 = heatmap(
temp_lon,
temp_lat,
temp[:, :, end]';
xlabel="Longitude",
ylabel="Latitude",
title="Temperature on 12/31/2020"
)
plot(h1, h2; layout=(1, 2), size=(900, 400))3.2 Ensuring temperature and precipitation correspond to the same time period
@assert temp_time == precip_timetime_data = Dates.Date.(temp_time)15341-element Vector{Date}:
1979-01-01
1979-01-02
1979-01-03
1979-01-04
1979-01-05
1979-01-06
1979-01-07
1979-01-08
1979-01-09
1979-01-10
1979-01-11
1979-01-12
1979-01-13
⋮
2020-12-20
2020-12-21
2020-12-22
2020-12-23
2020-12-24
2020-12-25
2020-12-26
2020-12-27
2020-12-28
2020-12-29
2020-12-30
2020-12-31
4 Splitting Data: Training Period vs Testing Period
# Performing a typical split ratio (roughly 80:20) on the daily data collected from 1979 through 2020. With 42 years of data, the training period covers the first 32 years (1979–2010) and the testing period the final 10 years (2011–2020)
split_idx = findfirst(time_data .== time_data[end] - Dates.Year(10))
train_idx = 1:split_idx
test_idx = (split_idx+1):length(time_data);4.1 Testing and Training Period Variables
precip_train = precip[:, :, train_idx];
precip_test = precip[:, :, test_idx];
temp_train = temp[:, :, train_idx];
temp_test = temp[:, :, test_idx];5 Preprocessing Data: Climatology and Mean Centering
"""
    preprocess(temp, temp_reference)

Subtract the time-mean climatology of `temp_reference` from `temp` and flatten
the resulting anomalies into a 2D matrix with grid cells as rows and time steps
as columns — the variables × observations layout that `fit(PCA, ...)` from
MultivariateStats expects.

Generalized from `Array{T,3}` to any 3-D arrays so the two inputs need not
share an eltype (e.g. `Union{Missing,Float32}` data against a `Float64`
reference); behavior for the original argument types is unchanged.
"""
function preprocess(temp::AbstractArray{<:Any,3}, temp_reference::AbstractArray{<:Any,3})::AbstractMatrix
    # Climatology = mean over the time dimension (dims=3) of the reference
    # period; removing it leaves anomalies so PCA captures variability rather
    # than the mean climate state.
    climatology = mean(temp_reference; dims=3)
    anomalies = temp .- climatology
    # Flatten (lon, lat, time) -> (lon*lat, time): rows are locations
    # (variables), columns are time steps (observations).
    return reshape(anomalies, size(temp, 1) * size(temp, 2), size(temp, 3))
end
5.1 Apply to Training and Testing Datasets
#Preprocessing temp_train and temp_test; both are preprocessed to the temp training data so they both use the same climatology
n_lon, n_lat, n_t = size(temp)
temp_training_matrix = preprocess(temp_train, temp_train);
temp_testing_matrix = preprocess(temp_test, temp_train);6 Principle Component Analysis
6.1 Fitting
#Fitting a PCA model to training period to automatically choose the number of principal components
PCA_model = fit(PCA, temp_training_matrix; maxoutdim=10, pratio=0.999);6.2 Plotting Variance Explained by PCs
# Variables for plotting the variance accounted for by the principal components
variance_explained = principalvars(PCA_model)
total_var = var(PCA_model)
cumulative_var = cumsum(variance_explained)./total_var10-element Vector{Float32}:
0.9599857
0.98082733
0.98715866
0.9929077
0.99447197
0.9957567
0.99657375
0.9971035
0.9975621
0.99789464
p1 = plot(
variance_explained / total_var;
xlabel="Number of PCs",
ylabel="Fraction of Variance Explained",
label=false,
title="Variance Explained"
)
p2 = plot(
cumulative_var;
xlabel="Number of PCs",
ylabel="Fraction of Variance Explained",
label=false,
title="Cumulative Variance Explained"
)
plot(p1, p2; layout=(1, 2), size=(900, 400))
Note
I chose to select 4 principal components because, after plotting the variance explained, the first 4 principal components account for a cumulative variance of about 0.993, retaining enough information while also reducing the noise of outliers.
pc = projection(PCA_model)[:, 3];6.3 Plotting Spatial Patterns
p = []
for i in 1:4
pc = projection(PCA_model)[:, i]
pc_reshaped = reshape(pc, n_lon, n_lat)'
pi = heatmap(
temp_lon,
temp_lat,
pc_reshaped;
xlabel="Longitude",
ylabel="Latitude",
title="PC $i",
aspect_ratio=:equal,
cmap=:PuOr
)
push!(p, pi)
end
plot(p...; layout=(1, 4), size=(1500, 600))6.4 Plotting Time Series
pc_ts = predict(PCA_model, temp_training_matrix)
Months = Dates.month.(time_data)
custom_xticks = [1, 3, 6, 9, 12]
p = []
for i in 1:4
pi = scatter(
Months,
pc_ts[i, :];
xlabel="Months in Year",
ylabel="PC $i",
title="PC $i",
label=false,
alpha=0.3,
color=:blue,
xticks=(custom_xticks, custom_xticks)
)
push!(p, pi)
end
plot(p...; layout=(1, 4), size=(1100, 600))7 Scatter Plots: Comparison of Influence on Rainfall
avg_precip =
ustrip.(
u"inch", [mean(skipmissing(precip_train[:, :, t])) for t in 1:size(precip_train, 3)]
)
avg_precip = replace(avg_precip, NaN => 0)
p1_idx = findall(avg_precip .> quantile(avg_precip, 0.98))
p1 = scatter(
pc_ts[2, p1_idx],
pc_ts[3, p1_idx];
zcolor=avg_precip[p1_idx],
xlabel="PC 2",
ylabel="PC 3",
markersize=5,
clims=(0, 2.75),
title="Rainy Days",
label=false
)
p2_idx = findall(avg_precip .> quantile(avg_precip, 0.98))
p2 = scatter(
pc_ts[4, p2_idx],
pc_ts[3, p2_idx];
zcolor=avg_precip[p2_idx],
xlabel="PC 4",
ylabel="PC 3",
markersize=5,
clims=(0, 2.75),
title="Rainy Days",
label=false
)
p3_idx = findall(avg_precip .> quantile(avg_precip, 0.98))
p3 = scatter(
pc_ts[4, p3_idx],
pc_ts[2, p3_idx];
zcolor=avg_precip[p3_idx],
xlabel="PC 4",
ylabel="PC 2",
markersize=5,
clims=(0, 2.75),
title="Rainy Days",
label=false
)
plot(p1,p2, p3; size=(1000, 600), link=:both)8 KNN: Resampling Algorithm
"""
    euclidean_distance(x, y)

Euclidean (L2) distance between two equal-length vectors.
"""
function euclidean_distance(x::AbstractVector, y::AbstractVector)::AbstractFloat
    # sum(abs2, ...) folds the squared differences without squaring twice.
    return sqrt(sum(abs2, x .- y))
end
"""
    nsmallest(x, n)

Return the indices of the `n` smallest entries of `x`, in ascending order of
value.
"""
function nsmallest(x::AbstractVector, n::Int)::Vector{Int}
    # partialsortperm only orders the first n positions (O(len + n log n))
    # instead of fully sorting the permutation as sortperm does.
    return collect(partialsortperm(x, 1:n))
end
"""
    knn(X, X_i, K)

Find the `K` nearest rows of `X` to the query point `X_i` and sample one of
them with probability inversely proportional to its distance. Returns
`(row_index, row_vector)`.

Fix: if the nearest neighbor coincides with the query (distance 0), the old
inverse-distance weights became `Inf` and normalized to `NaN`, breaking
`sample`; we now return the exact match directly.
"""
function knn(X::AbstractMatrix, X_i::AbstractVector, K::Int)::Tuple{Int,AbstractVector}
    # Distance from the query to every row; views avoid copying each row.
    dist = [euclidean_distance(X_i, view(X, j, :)) for j in axes(X, 1)]
    idx = nsmallest(dist, K)
    # Exact match guard: an inverse-distance weight of 1/0 would be Inf.
    if iszero(dist[idx[1]])
        return (idx[1], vec(X[idx[1], :]))
    end
    # Inverse-distance weights, normalized to sum to 1.
    w = 1 ./ dist[idx]
    w ./= sum(w)
    idx_sample = sample(idx, Weights(w))  # weighted draw among the K neighbors
    return (idx_sample, vec(X[idx_sample, :]))
end
8.1 Combining KNN and PCA
"""
    predict_knn(temp_train, temp_test, precip_train; n_pca)

KNN analog resampling: embed training and test temperature anomalies in an
`n_pca`-dimensional PC space (basis fit on the training period only), then for
each test day draw one of its 3 nearest training days and return that day's
precipitation field. Returns a vector of 2-D precipitation slices, one per
test time step.
"""
function predict_knn(temp_train, temp_test, precip_train; n_pca::Int)
    # Anomaly matrices (locations × time), both centered on the TRAINING
    # climatology so the two periods live in the same space.
    train_mat = preprocess(temp_train, temp_train)
    test_mat = preprocess(temp_test, temp_train)
    # PCA basis learned from the training temperatures.
    pca = fit(PCA, train_mat; maxoutdim=n_pca)
    # Project both periods onto the reduced PC basis.
    Z_train = predict(pca, train_mat)
    Z_test = predict(pca, test_mat)
    # One resampled precipitation field per test day (3 nearest neighbors).
    return [
        precip_train[:, :, first(knn(Z_train', Z_test[:, t], 3))] for
        t in axes(Z_test, 2)
    ]
end
8.2 Predicted vs. Actual Values
t_sample = rand(1:size(temp_test, 3), 4)
precip_pred = predict_knn(temp_train, temp_test[:, :, t_sample], precip_train; n_pca=4)
p = map(eachindex(t_sample)) do ti
t = t_sample[ti]
y_pred = precip_pred[ti]'
y_actual = precip_test[:, :, t]'
cmax = max(maximum(skipmissing(y_pred)), maximum(skipmissing(y_actual)))
cmax = ustrip(u"mm", cmax)
p1 = heatmap(
precip_lon,
precip_lat,
y_pred;
xlabel="Longitude",
ylabel="Latitude",
title="Predicted",
aspect_ratio=:equal,
clims=(0, cmax)
)
p2 = heatmap(
precip_lon,
precip_lat,
y_actual;
xlabel="Longitude",
ylabel="Latitude",
title="Actual",
aspect_ratio=:equal,
clims=(0, cmax)
)
plot(p1, p2; layout=(2, 1), size=(1000, 400))
end
plot(p...; layout=(2, 3), size=(1400, 1200))9 Quantile-Quantile Mapping: Bias Correction
"""
    knn_quantile_mapping(temp_train, temp_test, precip_train, precip_test; n_pca, n_trees)

Bias-correct a KNN precipitation prediction with empirical quantile–quantile
mapping: each predicted value is replaced by the value at the same quantile of
the observed (actual) distribution. Only the FIRST test time step is corrected
and returned as a lon × lat matrix (unchanged from the original behavior).

NOTE(review): `n_trees` is accepted for interface compatibility but is unused.
NOTE(review): reads the file-level globals `precip_lon`/`precip_lat` for the
grid shape — presumably safe here, but worth passing explicitly.

Fixes vs. the original:
- the predicted distribution is sorted ONCE from the uncorrected values; the
  old code re-sorted `y_pred_corrected` inside the loop, so later points were
  ranked against a partially corrected vector (biased) at O(n² log n) cost;
- quantiles are computed over non-missing values only (the old denominator
  included missings, deflating every quantile);
- the unused `q`/`quantile_pred` computation is removed.
"""
function knn_quantile_mapping(temp_train, temp_test, precip_train, precip_test; n_pca::Int, n_trees::Int)
    # KNN prediction for the test period; keep the first day only.
    precip_pred = predict_knn(temp_train, temp_test, precip_train; n_pca=n_pca)
    y_pred = precip_pred[1]
    y_actual = precip_test[:, :, 1]
    # Flatten the predicted field for element-wise correction.
    y_pred_corrected = reshape(y_pred, length(precip_lon) * length(precip_lat))
    # Sorted copy of the ORIGINAL (uncorrected, non-missing) predictions:
    # every rank below is taken against this fixed distribution.
    sorted_pred = sort(collect(skipmissing(y_pred_corrected)))
    n_valid = length(sorted_pred)
    for i in eachindex(y_pred_corrected)
        if !ismissing(y_pred_corrected[i])
            # Empirical quantile of this value in the predicted distribution.
            point_quantile = min(
                searchsortedfirst(sorted_pred, y_pred_corrected[i]) / n_valid, 1.0
            )
            # Replace with the same quantile of the observed distribution.
            y_pred_corrected[i] = quantile(skipmissing(y_actual), point_quantile)
        end
    end
    # Restore the lon × lat grid shape.
    return reshape(y_pred_corrected, length(precip_lon), length(precip_lat))
end
t_sample = rand(1:size(temp_test, 3), 4)
y_pred_corrected = knn_quantile_mapping(temp_train, temp_test[:, :, t_sample], precip_train, precip_test; n_pca=4, n_trees=100)
p = map(eachindex(t_sample)) do ti
t = t_sample[ti]
y_actual = precip_test[:, :, t]'
cmax = max(maximum(skipmissing(y_pred_corrected)), maximum(skipmissing(y_actual)))
cmax = ustrip(u"mm", cmax)
p1 = heatmap(
precip_lon,
precip_lat,
y_pred_corrected';
xlabel="Longitude",
ylabel="Latitude",
title="Predicted (Quantile Mapping)",
aspect_ratio=:equal,
clims=(0, cmax)
)
p2 = heatmap(
precip_lon,
precip_lat,
y_actual;
xlabel="Longitude",
ylabel="Latitude",
title="Actual",
aspect_ratio=:equal,
clims=(0, cmax)
)
plot(p1, p2; layout=(2, 1), size=(1000, 400))
end
plot(p...; layout=(2, 3), size=(1400, 1200))